etree_lxml.py 6.2 KB


  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. from collections import OrderedDict
  4. from lxml import etree
  5. from ..treebuilders.etree import tag_regexp
  6. from . import base
  7. from .. import _ihatexml
  def ensure_str(s):
      """Return *s* as a text string, passing ``None`` through untouched.

      Values that are already ``text_type`` are returned as-is.  Anything
      else is decoded through its ``decode`` method as strict ASCII, so
      non-ASCII bytes raise ``UnicodeDecodeError``.
      """
      # None and text need no conversion; hand them straight back.
      if s is None or isinstance(s, text_type):
          return s
      return s.decode("ascii", "strict")
  class Root(object):
      """Synthetic document node placed above a tree's top-level nodes.

      ``children`` holds an optional ``Doctype`` entry followed by the root
      node and all of its siblings, in ``getnext()`` order.
      """

      def __init__(self, et):
          """Collect the top-level nodes reachable from *et*.

          *et* may be a tree object (something with ``getroot()`` and,
          optionally, ``docinfo``) or a bare node; missing attributes are
          tolerated via ``AttributeError``.
          """
          self.elementtree = et
          self.text = None
          self.tail = None
          self.children = []

          # A doctype entry is only added when the tree reports an internal
          # DTD.  Objects without ``docinfo`` simply skip this step.
          try:
              info = et.docinfo
              if info.internalDTD:
                  doctype = Doctype(self,
                                    ensure_str(info.root_name),
                                    ensure_str(info.public_id),
                                    ensure_str(info.system_url))
                  self.children.append(doctype)
          except AttributeError:
              pass

          # Accept either a tree (has getroot) or a plain node.
          try:
              current = et.getroot()
          except AttributeError:
              current = et

          # Rewind to the first top-level sibling ...
          earlier = current.getprevious()
          while earlier is not None:
              current = earlier
              earlier = current.getprevious()

          # ... then record every sibling from there onwards.
          while current is not None:
              self.children.append(current)
              current = current.getnext()

      def __getitem__(self, key):
          """Index into the collected top-level children."""
          return self.children[key]

      def getnext(self):
          """The document node never has a following sibling."""
          return None

      def __len__(self):
          """Always report a length of 1, regardless of the child count."""
          return 1
  class Doctype(object):
      """Stand-in node carrying a document type's name and identifiers."""

      def __init__(self, root_node, name, public_id, system_id):
          """Store the owning root node and the doctype's three fields."""
          self.root_node = root_node
          self.name, self.public_id, self.system_id = name, public_id, system_id
          # Present so the node exposes the same text/tail attributes as others.
          self.text = self.tail = None

      def getnext(self):
          """Return the entry at index 1 of the root node's children.

          This presumes the doctype itself is stored at index 0 of that list.
          """
          siblings = self.root_node.children
          return siblings[1]
  class FragmentRoot(Root):
      """Root node for a list of fragment items instead of a whole tree.

      ``Root.__init__`` is not invoked, so only ``children``, ``text`` and
      ``tail`` are set on the instance.
      """

      def __init__(self, children):
          """Wrap each item of *children* in a ``FragmentWrapper``."""
          wrapped = []
          for item in children:
              wrapped.append(FragmentWrapper(self, item))
          self.children = wrapped
          self.text = None
          self.tail = None

      def getnext(self):
          """A fragment root has no following sibling."""
          return None
  class FragmentWrapper(object):
      """Wrap one fragment item so it can be walked like a tree node.

      The wrapper keeps a reference to its ``FragmentRoot`` (for sibling
      navigation) and to the wrapped object.  ``text`` and ``tail`` are
      copied from the wrapped object when it has them, otherwise they are
      ``None``.  Any attribute not defined here is looked up on the wrapped
      object.
      """

      def __init__(self, fragment_root, obj):
          """Remember the owning root and the wrapped object *obj*."""
          self.root_node = fragment_root
          self.obj = obj
          if hasattr(self.obj, 'text'):
              self.text = ensure_str(self.obj.text)
          else:
              self.text = None
          if hasattr(self.obj, 'tail'):
              self.tail = ensure_str(self.obj.tail)
          else:
              self.tail = None

      def __getattr__(self, name):
          """Delegate unknown attributes to the wrapped object.

          Raises ``AttributeError`` when the attribute is missing, including
          the case where ``obj`` itself has not been set yet.
          """
          # __getattr__ only runs after normal lookup has failed.  If the
          # missing name is ``obj`` (instance created without __init__, e.g.
          # through ``__new__``), delegating via ``self.obj`` would re-enter
          # this method until the recursion limit is hit.  Fail cleanly.
          if name == "obj":
              raise AttributeError(name)
          return getattr(self.obj, name)

      def getnext(self):
          """Return the next wrapper in the root's children, or ``None``."""
          siblings = self.root_node.children
          idx = siblings.index(self)
          if idx < len(siblings) - 1:
              return siblings[idx + 1]
          else:
              return None

      def __getitem__(self, key):
          """Index into the wrapped object."""
          return self.obj[key]

      def __bool__(self):
          """Truthiness mirrors the wrapped object."""
          return bool(self.obj)

      def getparent(self):
          """Top-level fragment items have no parent."""
          return None

      def __str__(self):
          return str(self.obj)

      def __unicode__(self):
          return str(self.obj)

      def __len__(self):
          """Length mirrors the wrapped object."""
          return len(self.obj)
  class TreeWalker(base.NonRecursiveTreeWalker):
      """Walk an lxml tree, or a list of fragment items, node by node.

      Text is not a node of its own in this model; it is represented as a
      ``(node, "text")`` or ``(node, "tail")`` tuple throughout.
      """

      def __init__(self, tree):
          """Wrap *tree* in a root node and initialise the base walker.

          A ``list`` is treated as a fragment and wrapped in ``FragmentRoot``;
          anything else is wrapped in ``Root``.
          """
          is_fragment = isinstance(tree, list)
          self.fragmentChildren = set(tree) if is_fragment else set()
          root = FragmentRoot(tree) if is_fragment else Root(tree)
          base.NonRecursiveTreeWalker.__init__(self, root)
          self.filter = _ihatexml.InfosetFilter()

      def getNodeDetails(self, node):
          """Return a tuple describing *node*, led by its ``base`` node type."""
          if isinstance(node, tuple):  # Text node
              owner, key = node
              assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
              return base.TEXT, ensure_str(getattr(owner, key))

          if isinstance(node, Root):
              return (base.DOCUMENT,)

          if isinstance(node, Doctype):
              return base.DOCTYPE, node.name, node.public_id, node.system_id

          # A wrapped fragment item without a tag is reported as text.
          if isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
              return base.TEXT, ensure_str(node.obj)

          if node.tag == etree.Comment:
              return base.COMMENT, ensure_str(node.text)

          if node.tag == etree.Entity:
              return base.ENTITY, ensure_str(node.text)[1:-1]  # strip &;

          # Everything else is assumed to be an ordinary element.
          raw_tag = ensure_str(node.tag)
          tag_match = tag_regexp.match(raw_tag)
          if tag_match:
              namespace, tag = tag_match.groups()
          else:
              namespace, tag = None, raw_tag

          # Attributes are keyed by (namespace, local name), in source order.
          attrs = OrderedDict()
          for raw_name, raw_value in node.attrib.items():
              attr_name = ensure_str(raw_name)
              attr_value = ensure_str(raw_value)
              name_match = tag_regexp.match(attr_name)
              if name_match:
                  attr_key = name_match.group(1, 2)
              else:
                  attr_key = (None, attr_name)
              attrs[attr_key] = attr_value

          local_name = self.filter.fromXmlName(tag)
          has_children = len(node) > 0 or node.text
          return (base.ELEMENT, namespace, local_name, attrs, has_children)

      def getFirstChild(self, node):
          """Return the leading text tuple of *node*, else its first child."""
          assert not isinstance(node, tuple), "Text nodes have no children"
          assert len(node) or node.text, "Node has no children"
          return (node, "text") if node.text else node[0]

      def getNextSibling(self, node):
          """Return what follows *node* at the same level, or ``None``."""
          if not isinstance(node, tuple):
              # A node's tail text, when present, comes before its next sibling.
              if node.tail:
                  return (node, "tail")
              return node.getnext()

          # Text node
          owner, key = node
          assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
          if key == "text":
              # Test the child count explicitly: a child element that has no
              # children of its own may itself evaluate as false.
              return owner[0] if len(owner) else None
          # tail
          return owner.getnext()

      def getParentNode(self, node):
          """Return the parent of *node*, or ``None`` for fragment children."""
          if isinstance(node, tuple):  # Text node
              owner, key = node
              assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
              if key == "text":
                  # Leading text belongs to the node that carries it.
                  return owner
              # Tail text shares the parent of the node it trails.
              return owner.getparent()

          if node in self.fragmentChildren:
              return None
          return node.getparent()